Set the working dir
setwd("/mnt/picea/projects/arabidopsis/mschmid/porcupine-RNA-Seq")
Load libraries
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
Source some helper functions
source("~/Git/UPSCb/src/R/plot.multidensity.R")
Create a palette
pal <- brewer.pal(8,"Dark2")
Register the default plot margin
mar <- par("mar")
Read the sample information
samples <- read.csv("~/Git/UPSCb/projects/arabidopsis-porcupine-RNA-Seq/doc/RNA-seq_pcp_vs_Col-0_info.csv")
Read the tx2gene translation
tx2gene <- read.table("/mnt/picea/storage/reference/Arabidopsis-thaliana/TAIR10/kallisto/tx2gene.txt",
header=TRUE)
tx2gene <- tx2gene[,2:1]
We start with these as they have been split in multiple files
Read the re-sequencing data; list the files
rerun <- list.files("rerun/kallisto",
recursive = TRUE,
pattern = "abundance.tsv",
full.names = TRUE)
name them
names(rerun) <- sub("_sortmerna.*","",
sapply(strsplit(rerun, "/"), .subset, 3))
create a rerun sample file
samples.2 <- data.frame(ID=names(rerun),
Parent=sub("_.*","",names(rerun)),
stringsAsFactors=FALSE)
Read the expression at the transcript level
tx2 <- suppressMessages(tximport(files = rerun, type = "kallisto",
txOut = TRUE))
kt2 <- round(tx2$counts)
And summarize it at the gene level
kg2 <- round(summarizeToGene(txi=tx2,tx2gene=tx2gene)$counts)
## summarizing abundance
## summarizing counts
## summarizing length
orig <- list.files("firstrun/kallisto",
recursive = TRUE,
pattern = "abundance.tsv",
full.names = TRUE)
name them
names(orig) <- sub("_S[0-9]_L[0-9]{3}_sortmerna.*","",
sapply(strsplit(orig, "/"), .subset, 3))
Reorder the sample data.frame according to the way the results were read in and accounting for tech reps
samples <- samples[match(names(orig),samples$SampleID),]
Add some categorical variables
samples$Genotype=sub(" .*","",samples$Description)
samples$Temp=ifelse(sub(".* ","",samples$Description)=="16",
ifelse(nchar(as.character(samples$Description))>16,"WC","C"),
ifelse(nchar(as.character(samples$Description))>16,"CW","W"))
Read the expression at the transcript level
tx <- suppressMessages(tximport(files = orig,
type = "kallisto",
txOut = TRUE))
kt <- round(tx$counts)
And summarize it at the gene level
kg <- round(summarizeToGene(txi=tx,tx2gene=tx2gene)$counts)
## summarizing abundance
## summarizing counts
## summarizing length
ct.2 <- do.call(cbind,
lapply(split.data.frame(t(kt2),
samples.2$Parent),
colSums))
ct <- kt
ct[,match(c("13","14"),colnames(ct))] <-
ct[,match(c("13","14"),colnames(ct))] +
ct.2[,c("pcp-3D-23-1","pcp-3D-23-2")]
cg.2 <- do.call(cbind,
lapply(split.data.frame(t(kg2),
samples.2$Parent),
colSums))
cg <- kg
cg[,match(c("13","14"),colnames(cg))] <-
cg[,match(c("13","14"),colnames(cg))] +
cg.2[,c("pcp-3D-23-1","pcp-3D-23-2")]
Check how many genes are never expressed
sel <- rowSums(kg2) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(kg2),digits=1),
sum(sel),
nrow(kg2))
## [1] "30.3% percent (10165) of 33602 genes are not expressed"
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
round(sum(sel) * 100/ nrow(kt2),digits=1),
sum(sel),
nrow(kt2))
## [1] "24.4% percent (10165) of 41671 transcripts are not expressed"
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
plot(density(log10(rowMeans(kg2))),col=pal[1],
main="gene mean raw counts distribution",
xlab="mean raw counts (log10)")
The cumulative transcript coverage is as expected
plot(density(log10(rowMeans(kt2))),col=pal[1],
main="transcript mean raw counts distribution",
xlab="mean raw counts (log10)")
The same is done for the individual samples colored by condition. The gene and transcript coverage look very similar. The split are all identical, aparts from the 2 with uneven depth (the splits with << 10M reads)
plot.multidensity(lapply(1:ncol(kg2),function(k){log10(kg2)[,k]}),
col=pal[as.integer(as.factor(samples.2$Parent))],
legend.x="topright",
legend=levels(as.factor(samples.2$Parent)),
legend.col=pal[1:nlevels(as.factor(samples.2$Parent))],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per gene raw counts (log10)")
plot.multidensity(lapply(1:ncol(kt2),function(k){log10(kt2)[,k]}),
col=pal[as.integer(as.factor(samples.2$Parent))],
legend.x="topright",
legend=levels(as.factor(samples.2$Parent)),
legend.col=pal[1:nlevels(as.factor(samples.2$Parent))],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per transcript raw counts (log10)")
Check how many genes are never expressed
sel <- rowSums(kg) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(kg),digits=1),
sum(sel),
nrow(kg))
## [1] "17.2% percent (5791) of 33602 genes are not expressed"
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
round(sum(sel) * 100/ nrow(kt),digits=1),
sum(sel),
nrow(kt))
## [1] "13.9% percent (5791) of 41671 transcripts are not expressed"
The cumulative gene coverage is as expected
plot(density(log10(rowMeans(kg))),col=pal[1],
main="gene mean raw counts distribution",
xlab="mean raw counts (log10)")
The cumulative transcript coverage is as expected
plot(density(log10(rowMeans(kt))),col=pal[1],
main="transcript mean raw counts distribution",
xlab="mean raw counts (log10)")
The same is done for the individual samples colored by condition. We oberved the expected bias from the 2 undersequenced smaples
plot.multidensity(lapply(1:ncol(kg),function(k){log10(kg)[,k]}),
col=pal[as.integer(samples$Description)],
legend.x="topright",
legend=levels(samples$Description),
legend.col=pal[1:nlevels(samples$Description)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per gene raw counts (log10)")
plot.multidensity(lapply(1:ncol(kt),function(k){log10(kt)[,k]}),
col=pal[as.integer(samples$Description)],
legend.x="topright",
legend=levels(samples$Description),
legend.col=pal[1:nlevels(samples$Description)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per transcript raw counts (log10)")
Check how many genes are never expressed
sel <- rowSums(cg) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(cg),digits=1),
sum(sel),
nrow(cg))
## [1] "16.6% percent (5593) of 33602 genes are not expressed"
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
round(sum(sel) * 100/ nrow(ct),digits=1),
sum(sel),
nrow(ct))
## [1] "13.4% percent (5593) of 41671 transcripts are not expressed"
The cumulative gene coverage is as expected
plot(density(log10(rowMeans(cg))),col=pal[1],
main="gene mean raw counts distribution",
xlab="mean raw counts (log10)")
The cumulative transcript coverage is as expected
plot(density(log10(rowMeans(ct))),col=pal[1],
main="transcript mean raw counts distribution",
xlab="mean raw counts (log10)")
The same is done for the individual samples colored by condition. We oberved the expected bias from the 2 undersequenced smaples
plot.multidensity(lapply(1:ncol(cg),function(k){log10(cg)[,k]}),
col=pal[as.integer(samples$Description)],
legend.x="topright",
legend=levels(samples$Description),
legend.col=pal[1:nlevels(samples$Description)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per gene raw counts (log10)")
plot.multidensity(lapply(1:ncol(ct),function(k){log10(ct)[,k]}),
col=pal[as.integer(samples$Description)],
legend.x="topright",
legend=levels(samples$Description),
legend.col=pal[1:nlevels(samples$Description)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per transcript raw counts (log10)")
dir.create(file.path("analysis","kallisto"),showWarnings=FALSE)
write.csv(cg,file="analysis/kallisto/combined-raw-unormalised-gene-expression_data.csv")
write.csv(ct,file="analysis/kallisto/combined-raw-unormalised-transcript-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate ### Re-sequencing data Create the dds object, without giving any prior on the design
conditions <- colnames(kg2)
dds.kg2 <- DESeqDataSetFromMatrix(
countData = kg2,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds.kg2 <- estimateSizeFactors(dds.kg2)
sizes.kg2 <- sizeFactors(dds.kg2)
names(sizes.kg2) <- colnames(kg2)
pander(sizes.kg2)
| pcp-3D-23-1_S1_10 | pcp-3D-23-1_S1_11 | pcp-3D-23-1_S1_12 | pcp-3D-23-1_S1_13 |
|---|---|---|---|
| 1.137 | 1.127 | 1.138 | 1.115 |
| pcp-3D-23-1_S1_14 | pcp-3D-23-1_S1_15 | pcp-3D-23-1_S1_16 | pcp-3D-23-1_S1_17 |
|---|---|---|---|
| 1.133 | 1.14 | 1.139 | 1.135 |
| pcp-3D-23-1_S1_18 | pcp-3D-23-1_S1_19 | pcp-3D-23-1_S1_1 | pcp-3D-23-1_S1_20 |
|---|---|---|---|
| 1.135 | 1.133 | 1.133 | 1.138 |
| pcp-3D-23-1_S1_21 | pcp-3D-23-1_S1_22 | pcp-3D-23-1_S1_2 | pcp-3D-23-1_S1_3 |
|---|---|---|---|
| 1.14 | 0.4358 | 1.128 | 1.132 |
| pcp-3D-23-1_S1_4 | pcp-3D-23-1_S1_5 | pcp-3D-23-1_S1_6 | pcp-3D-23-1_S1_7 |
|---|---|---|---|
| 1.139 | 1.134 | 1.134 | 1.133 |
| pcp-3D-23-1_S1_8 | pcp-3D-23-1_S1_9 | pcp-3D-23-2_S2_10 | pcp-3D-23-2_S2_11 |
|---|---|---|---|
| 1.135 | 1.134 | 1.031 | 1.029 |
| pcp-3D-23-2_S2_12 | pcp-3D-23-2_S2_13 | pcp-3D-23-2_S2_14 | pcp-3D-23-2_S2_15 |
|---|---|---|---|
| 1.034 | 1.02 | 1.03 | 1.037 |
| pcp-3D-23-2_S2_16 | pcp-3D-23-2_S2_17 | pcp-3D-23-2_S2_18 | pcp-3D-23-2_S2_19 |
|---|---|---|---|
| 1.037 | 1.032 | 1.035 | 1.035 |
| pcp-3D-23-2_S2_1 | pcp-3D-23-2_S2_20 | pcp-3D-23-2_S2_21 | pcp-3D-23-2_S2_2 |
|---|---|---|---|
| 1.033 | 1.038 | 0.1794 | 1.03 |
| pcp-3D-23-2_S2_3 | pcp-3D-23-2_S2_4 | pcp-3D-23-2_S2_5 | pcp-3D-23-2_S2_6 |
|---|---|---|---|
| 1.034 | 1.035 | 1.029 | 1.03 |
| pcp-3D-23-2_S2_7 | pcp-3D-23-2_S2_8 | pcp-3D-23-2_S2_9 |
|---|---|---|
| 1.031 | 1.031 | 1.042 |
boxplot(sizes.kg2, main="Sequencing libraries size factor")
At the transcript level
dds.kt2 <- DESeqDataSetFromMatrix(
countData = kt2,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds.kt2 <- estimateSizeFactors(dds.kt2)
sizes.kt2 <- sizeFactors(dds.kt2)
names(sizes.kt2) <- colnames(kt2)
pander(sizes.kt2)
| pcp-3D-23-1_S1_10 | pcp-3D-23-1_S1_11 | pcp-3D-23-1_S1_12 | pcp-3D-23-1_S1_13 |
|---|---|---|---|
| 1.128 | 1.118 | 1.127 | 1.11 |
| pcp-3D-23-1_S1_14 | pcp-3D-23-1_S1_15 | pcp-3D-23-1_S1_16 | pcp-3D-23-1_S1_17 |
|---|---|---|---|
| 1.125 | 1.135 | 1.132 | 1.128 |
| pcp-3D-23-1_S1_18 | pcp-3D-23-1_S1_19 | pcp-3D-23-1_S1_1 | pcp-3D-23-1_S1_20 |
|---|---|---|---|
| 1.128 | 1.126 | 1.126 | 1.131 |
| pcp-3D-23-1_S1_21 | pcp-3D-23-1_S1_22 | pcp-3D-23-1_S1_2 | pcp-3D-23-1_S1_3 |
|---|---|---|---|
| 1.135 | 0.4342 | 1.121 | 1.122 |
| pcp-3D-23-1_S1_4 | pcp-3D-23-1_S1_5 | pcp-3D-23-1_S1_6 | pcp-3D-23-1_S1_7 |
|---|---|---|---|
| 1.132 | 1.13 | 1.122 | 1.126 |
| pcp-3D-23-1_S1_8 | pcp-3D-23-1_S1_9 | pcp-3D-23-2_S2_10 | pcp-3D-23-2_S2_11 |
|---|---|---|---|
| 1.128 | 1.13 | 1.041 | 1.037 |
| pcp-3D-23-2_S2_12 | pcp-3D-23-2_S2_13 | pcp-3D-23-2_S2_14 | pcp-3D-23-2_S2_15 |
|---|---|---|---|
| 1.042 | 1.03 | 1.041 | 1.047 |
| pcp-3D-23-2_S2_16 | pcp-3D-23-2_S2_17 | pcp-3D-23-2_S2_18 | pcp-3D-23-2_S2_19 |
|---|---|---|---|
| 1.045 | 1.042 | 1.046 | 1.044 |
| pcp-3D-23-2_S2_1 | pcp-3D-23-2_S2_20 | pcp-3D-23-2_S2_21 | pcp-3D-23-2_S2_2 |
|---|---|---|---|
| 1.041 | 1.05 | 0.1805 | 1.04 |
| pcp-3D-23-2_S2_3 | pcp-3D-23-2_S2_4 | pcp-3D-23-2_S2_5 | pcp-3D-23-2_S2_6 |
|---|---|---|---|
| 1.044 | 1.047 | 1.04 | 1.037 |
| pcp-3D-23-2_S2_7 | pcp-3D-23-2_S2_8 | pcp-3D-23-2_S2_9 |
|---|---|---|
| 1.04 | 1.039 | 1.052 |
boxplot(sizes.kt2, main="Sequencing libraries size factor")
Just to compare the estimated sizes
plot(sizes.kt2,sizes.kg2,main="library sizes",xlab="transcript",ylab="gene")
abline(0,1)
Create the dds object, without giving any prior on the design
conditions <- colnames(kg)
dds.kg <- DESeqDataSetFromMatrix(
countData = kg,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds.kg <- estimateSizeFactors(dds.kg)
sizes.kg <- sizeFactors(dds.kg)
names(sizes.kg) <- colnames(kg)
pander(sizes.kg)
| 13 | 14 | 15 | 16 | 17 | 18 | 1 | 25 | 26 | 27 | 28 | 29 | 2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.07284 | 0.2717 | 1.45 | 1.193 | 1.27 | 1.289 | 0.8732 | 1.35 | 1.393 | 1.247 | 1.286 | 1.241 | 1.062 |
| 30 | 37 | 38 | 39 | 3 | 40 | 41 | 42 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1.218 | 1.249 | 1.081 | 1.094 | 1.057 | 1.882 | 1.21 | 1.01 | 1.411 | 1.064 | 0.8811 |
boxplot(sizes.kg, main="Sequencing libraries size factor")
At the transcript level
dds.kt <- DESeqDataSetFromMatrix(
countData = kt,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds.kt <- estimateSizeFactors(dds.kt)
sizes.kt <- sizeFactors(dds.kt)
names(sizes.kt) <- colnames(kt)
pander(sizes.kt)
| 13 | 14 | 15 | 16 | 17 | 18 | 1 | 25 | 26 | 27 | 28 | 29 | 2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0741 | 0.2732 | 1.453 | 1.195 | 1.272 | 1.29 | 0.8753 | 1.35 | 1.394 | 1.25 | 1.288 | 1.242 | 1.062 |
| 30 | 37 | 38 | 39 | 3 | 40 | 41 | 42 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1.218 | 1.251 | 1.082 | 1.097 | 1.058 | 1.886 | 1.212 | 1.014 | 1.412 | 1.066 | 0.8826 |
boxplot(sizes.kt, main="Sequencing libraries size factor")
Just to compare the estimated sizes
plot(sizes.kt,sizes.kg,main="library sizes",xlab="transcript",ylab="gene")
abline(0,1)
Create the dds object, without giving any prior on the design
conditions <- colnames(kg)
dds.g <- DESeqDataSetFromMatrix(
countData = cg,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds.g <- estimateSizeFactors(dds.g)
sizes.g <- sizeFactors(dds.g)
names(sizes.g) <- colnames(cg)
pander(sizes.g)
| 13 | 14 | 15 | 16 | 17 | 18 | 1 | 25 | 26 | 27 | 28 | 29 | 2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.22 | 1.187 | 1.19 | 0.9814 | 1.046 | 1.061 | 0.7175 | 1.11 | 1.145 | 1.025 | 1.058 | 1.021 | 0.8738 |
| 30 | 37 | 38 | 39 | 3 | 40 | 41 | 42 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1.002 | 1.028 | 0.8902 | 0.9011 | 0.8717 | 1.547 | 0.9951 | 0.8304 | 1.162 | 0.875 | 0.725 |
boxplot(sizes.g, main="Sequencing libraries size factor")
At the transcript level
dds.t <- DESeqDataSetFromMatrix(
countData = ct,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds.t <- estimateSizeFactors(dds.t)
sizes.t <- sizeFactors(dds.t)
names(sizes.t) <- colnames(ct)
pander(sizes.t)
| 13 | 14 | 15 | 16 | 17 | 18 | 1 | 25 | 26 | 27 | 28 | 29 | 2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.211 | 1.176 | 1.195 | 0.986 | 1.052 | 1.067 | 0.7211 | 1.114 | 1.151 | 1.031 | 1.064 | 1.026 | 0.8798 |
| 30 | 37 | 38 | 39 | 3 | 40 | 41 | 42 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1.007 | 1.034 | 0.8944 | 0.907 | 0.8739 | 1.558 | 1.001 | 0.8363 | 1.166 | 0.8769 | 0.7272 |
boxplot(sizes.t, main="Sequencing libraries size factor")
Just to compare the estimated sizes
plot(sizes.t,sizes.g,main="library sizes",xlab="transcript",ylab="gene")
abline(0,1)
At the gene level
vsd.kg2 <- varianceStabilizingTransformation(dds.kg2, blind=TRUE)
vst.kg2 <- assay(vsd.kg2)
colnames(vst.kg2) <- colnames(kg2)
vst.kg2 <- vst.kg2 - min(vst.kg2)
Validate the VST
meanSdPlot(vst.kg2[rowSums(kg2)>0,])
At the transcript level
vsd.kt2 <- varianceStabilizingTransformation(dds.kt2, blind=TRUE)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
vst.kt2 <- assay(vsd.kt2)
colnames(vst.kt2) <- colnames(kt2)
vst.kt2 <- vst.kt2 - min(vst.kt2)
Validate the VST
meanSdPlot(vst.kt2[rowSums(kt2)>0,])
At the gene level
vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE)
vst.kg <- assay(vsd.kg)
colnames(vst.kg) <- colnames(kg)
vst.kg <- vst.kg - min(vst.kg)
Validate the VST
meanSdPlot(vst.kg[rowSums(kg)>0,])
At the transcript level
vsd.kt <- varianceStabilizingTransformation(dds.kt, blind=TRUE)
vst.kt <- assay(vsd.kt)
colnames(vst.kt) <- colnames(kt)
vst.kt <- vst.kt - min(vst.kt)
Validate the VST
meanSdPlot(vst.kt[rowSums(kt)>0,])
At the gene level
vsd.g <- varianceStabilizingTransformation(dds.g, blind=TRUE)
vst.g <- assay(vsd.g)
colnames(vst.g) <- colnames(cg)
vst.g <- vst.g - min(vst.g)
write.csv(vst.g,"analysis/kallisto/combined-library-size-normalized_variance-stabilized_gene-expression_data.csv")
Validate the VST
meanSdPlot(vst.g[rowSums(cg)>0,])
At the transcript level
vsd.t <- varianceStabilizingTransformation(dds.t, blind=TRUE)
vst.t <- assay(vsd.t)
colnames(vst.t) <- colnames(ct)
vst.t <- vst.t - min(vst.t)
write.csv(vst.t,"analysis/kallisto/combined-library-size-normalized_variance-stabilized_transcript-expression_data.csv")
Validate the VST
meanSdPlot(vst.t[rowSums(ct)>0,])
".pca" <- function(vst,fact,lgd="topleft"){
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
#' ### 3 first dimensions
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(fact)],
pch=19)
legend(lgd,pch=19,
col=pal[1:nlevels(fact)],
legend=levels(fact))
par(mar=mar)
}
At the gene level
.pca(vst.kg2,factor(samples.2$Parent))
At the transcript level
.pca(vst.kt2,factor(samples.2$Parent))
At the gene level
.pca(vst.kg,factor(samples$Description),lgd="topright")
At the transcript level
.pca(vst.kt,factor(samples$Description),lgd="topright")
At the gene level
.pca(vst.g,factor(samples$Description),lgd="topright")
At the transcript level
.pca(vst.t,factor(samples$Description),lgd="topright")
1000 most variable genes labelled with genotype and temperature and showing per gene expression z-scores.
At the gene level
sel <- order(apply(vst.kg2,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kg2[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
labCol = samples.2$Parent)
At the transcript level
sel <- order(apply(vst.kt2,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kt2[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
labCol = samples.2$Parent)
At the gene level
sel <- order(apply(vst.kg,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kg[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
labCol = paste(samples$Genotype,samples$Temp))
At the transcript level
sel <- order(apply(vst.kt,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.kt[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
labCol = paste(samples$Genotype,samples$Temp))
At the gene level The classification is as expected from the HTSeq analysis
sel <- order(apply(vst.g,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.g[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
labCol = paste(samples$Genotype,samples$Temp))
At the transcript level It is interesting to observe that some samples cluster differently, indicating (i) a possible technical bias in quantifying transcripts and/or (ii) a true natural variability
sel <- order(apply(vst.t,1,sd),decreasing=TRUE)[1:1000]
heatmap.2(vst.t[sel,],labRow = NA,trace = "none",cexCol = 0.6,scale = "row",
labCol = paste(samples$Genotype,samples$Temp))
pc <- prcomp(t(vst.g))
percent <- round(summary(pc)$importance[2,]*100)
The first dimension separates the 2 “final” temperature, whereas the second dimension separates to some extend the genotypes. The temperature, or its shift, also affect that second dimension
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(samples$Description)],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("top",pch=19,
col=pal[1:nlevels(samples$Description)],
legend=levels(samples$Description))
Just to make the point clearer, plotting the temperature categorical data (C,W,CW,WC)
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(samples$Temp))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("top",pch=19,
col=pal[1:nlevels(factor(samples$Temp))],
legend=levels(factor(samples$Temp)))
And the same for the temperature at collection
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(samples$Temperature.at.collection))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("top",pch=19,
col=pal[1:nlevels(factor(samples$Temperature.at.collection))],
legend=levels(factor(samples$Temperature.at.collection)))
And the same for the age
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(samples$Age.in.days))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("top",pch=19,
col=pal[1:nlevels(factor(samples$Age.in.days))],
legend=levels(factor(samples$Age.in.days)))
Which to some extend separates the pcp from the Col-0 genotypes
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(samples$Description)],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topleft",pch=19,
col=pal[1:nlevels(samples$Description)],
legend=levels(samples$Description))
And the same plotting the genotype
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(samples$Genotype))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topleft",pch=19,
col=pal[1:nlevels(factor(samples$Genotype))],
legend=levels(factor(samples$Genotype)))
And finally the same for the age
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(samples$Age.in.days))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topleft",pch=19,
col=pal[1:nlevels(factor(samples$Age.in.days))],
legend=levels(factor(samples$Age.in.days)))
pc <- prcomp(t(vst.t))
percent <- round(summary(pc)$importance[2,]*100)
The first dimension separates the 2 “final” temperature, whereas the second dimension separates to some extend the genotypes. The temperature, or its shift, also affect that second dimension
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(samples$Description)],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("bottom",pch=19,
col=pal[1:nlevels(samples$Description)],
legend=levels(samples$Description))
Just to make the point clearer, plotting the temperature categorical data (C,W,CW,WC)
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(samples$Temp))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("bottom",pch=19,
col=pal[1:nlevels(factor(samples$Temp))],
legend=levels(factor(samples$Temp)))
And the same for the temperature at collection
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(samples$Temperature.at.collection))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("bottom",pch=19,
col=pal[1:nlevels(factor(samples$Temperature.at.collection))],
legend=levels(factor(samples$Temperature.at.collection)))
And the same for the age
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(samples$Age.in.days))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("bottom",pch=19,
col=pal[1:nlevels(factor(samples$Age.in.days))],
legend=levels(factor(samples$Age.in.days)))
Which to some extend separates the pcp from the Col-0 genotypes
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(samples$Description)],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topleft",pch=19,
col=pal[1:nlevels(samples$Description)],
legend=levels(samples$Description))
And the same plotting the genotype
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(samples$Genotype))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topleft",pch=19,
col=pal[1:nlevels(factor(samples$Genotype))],
legend=levels(factor(samples$Genotype)))
And finally the same for the age
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(samples$Age.in.days))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topleft",pch=19,
col=pal[1:nlevels(factor(samples$Age.in.days))],
legend=levels(factor(samples$Age.in.days)))
htseq <- read.csv("analysis/HTSeq/library-size-normalized_variance-stabilized_data.csv",
row.names=1)
colnames(htseq) <- sub("X","",colnames(htseq))
klst <- vst.g[match(sub("\\.0","",rownames(htseq)),rownames(vst.g)),
match(colnames(htseq),colnames(vst.g))]
lapply(1:ncol(htseq),function(i){
heatscatter(htseq[,i],
klst[,i],
main=paste("sample",colnames(htseq)[i]),
xlab="HTSeq VST counts",
ylab="Kallisto VST counts")
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
Using Kallisto at the gene or transcript levels gives identical results. Kallisto gives similar results to HTSeq but rescues a non insignificant number of genes, some of which are very highly expressed.
Next step is to perform the DE analysis on the kallisto genes and transcripts and compare these to the HTSeq results.
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.1 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] hexbin_1.27.1 vsn_3.42.3
## [3] tximport_1.2.0 scatterplot3d_0.3-37
## [5] RColorBrewer_1.1-2 pander_0.6.0
## [7] LSD_3.0 gplots_3.0.1
## [9] DESeq2_1.14.0 SummarizedExperiment_1.4.0
## [11] Biobase_2.34.0 GenomicRanges_1.26.1
## [13] GenomeInfoDb_1.10.1 IRanges_2.8.1
## [15] S4Vectors_0.12.0 BiocGenerics_0.20.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.8 locfit_1.5-9.1 lattice_0.20-34
## [4] gtools_3.5.0 assertthat_0.1 rprojroot_1.1
## [7] digest_0.6.10 plyr_1.8.4 chron_2.3-47
## [10] backports_1.0.4 acepack_1.4.1 RSQLite_1.0.0
## [13] evaluate_0.10 BiocInstaller_1.24.0 ggplot2_2.2.0
## [16] zlibbioc_1.20.0 lazyeval_0.2.0 data.table_1.9.6
## [19] annotate_1.52.0 gdata_2.17.0 rpart_4.1-10
## [22] Matrix_1.2-7.1 preprocessCore_1.36.0 rmarkdown_1.2
## [25] labeling_0.3 splines_3.3.1 BiocParallel_1.8.1
## [28] geneplotter_1.52.0 stringr_1.1.0 foreign_0.8-67
## [31] RCurl_1.95-4.8 munsell_0.4.3 htmltools_0.3.5
## [34] nnet_7.3-12 tibble_1.2 gridExtra_2.2.1
## [37] htmlTable_1.7 Hmisc_4.0-0 XML_3.98-1.5
## [40] bitops_1.0-6 grid_3.3.1 affy_1.52.0
## [43] xtable_1.8-2 gtable_0.2.0 DBI_0.5-1
## [46] magrittr_1.5 scales_0.4.1 KernSmooth_2.23-15
## [49] stringi_1.1.2 XVector_0.14.0 genefilter_1.56.0
## [52] affyio_1.44.0 limma_3.30.4 latticeExtra_0.6-28
## [55] Formula_1.2-1 tools_3.3.1 survival_2.40-1
## [58] yaml_2.1.14 AnnotationDbi_1.36.0 colorspace_1.3-1
## [61] cluster_2.0.5 caTools_1.17.1 knitr_1.15.1